Packages used

library(tidyverse)
library(data.table)
library(DT)
library(ggthemes)
library(leaflet)
library(ggmap)
library(plotly)
library(lubridate)

First few rows of the data

# Read AB_NYC_2019.csv into a data.table.
airbnb_nyc <- fread("AB_NYC_2019.csv")

# Preview the first rows in an interactive table with horizontal scrolling.
datatable(head(airbnb_nyc), options = list(scrollX = TRUE))

Neighbourhood Distribution

# Bar chart of the percentage of rows falling in each neighbourhood group,
# with the percentage printed as a label on every bar.
p1 <- ggplot(
  airbnb_nyc[, .(freq = .N), by = neighbourhood_group][
    , perc := round(freq / sum(freq) * 100, 2)
  ],
  aes(neighbourhood_group, perc)
) +
  geom_col(width = 0.5, fill = "slategray2") +
  geom_text(
    aes(neighbourhood_group, perc, label = paste0(perc, "%")),
    position = position_dodge(width = 0.5),
    vjust = 0.07
  ) +
  theme_fivethirtyeight()
p1

Summary Stats for price based on location

# Convert the price column to double, modifying the table in place.
set(airbnb_nyc, j = "price", value = as.double(airbnb_nyc[["price"]]))

#' Summarise prices by one or more grouping columns
#'
#' Rows whose price is missing or equal to zero are dropped before the
#' statistics are computed.
#'
#' @param by_col Character vector naming the column(s) to group by.
#' @param dt A data.table with a numeric `price` column. Defaults to the
#'   global `airbnb_nyc`, so existing calls keep working unchanged.
#' @return A data.table with one row per group and the columns `Mean`
#'   (rounded to 2 decimals), `Median`, `First_quartile`, `Third_quartile`,
#'   `Min` and `Max`.
summary_function <- function(by_col, dt = airbnb_nyc) {
  stopifnot(
    is.data.table(dt),
    is.character(by_col),
    length(by_col) > 0
  )

  dt[
    !is.na(price) & price != 0,
    .(
      Mean = round(mean(price), 2),
      Median = median(price),
      # names = FALSE: the "25%"/"75%" labels are not needed in the table.
      First_quartile = quantile(price, 0.25, names = FALSE),
      Third_quartile = quantile(price, 0.75, names = FALSE),
      Min = min(price),
      Max = max(price)
    ),
    by = by_col
  ]
}


# Price summary per neighbourhood group, shown as an interactive table.
summary_function(by_col = "neighbourhood_group") %>%
  datatable()

Summary Stats for price based on room type

# Price summary per room type, shown as an interactive table.
summary_function(by_col = "room_type") %>%
  datatable()

Median price by room type in different neighbourhoods

# Median price for every neighbourhood group / room type pair, reshaped so
# that each room type becomes a column. Missing and zero prices are excluded,
# the same filter summary_function() applies, so both tables are based on the
# same set of rows.
airbnb_nyc[
  !is.na(price) & price != 0,
  .(Median = median(price)),
  by = .(neighbourhood_group, room_type)
] %>%
  dcast(neighbourhood_group ~ room_type, value.var = "Median") %>%
  datatable()

Get map using ggmap

# Fetch a base map whose bounding box is the extent of all longitude/latitude
# values in the data, padded by 0.0001 on every side.
# NOTE(review): maptype "watercolor" is requested together with source "osm";
# confirm this combination is supported by the installed ggmap version.
newyork_map <- with(
  airbnb_nyc,
  get_map(
    c(
      left = min(longitude) - .0001,
      bottom = min(latitude) - .0001,
      right = max(longitude) + .0001,
      top = max(latitude) + .0001
    ),
    maptype = "watercolor",
    source = "osm"
  )
)

Mapping function

#' Plot points on top of a base map, coloured by one column
#'
#' @param df Data containing `longitude`, `latitude` and the column named by
#'   `color_col`.
#' @param color_col Name (a single string) of the column mapped to the point
#'   colour. Must be an existing column of `df`.
#' @param continues_color_col `TRUE` to use the continuous viridis colour
#'   scale, `FALSE` to use the discrete one.
#' @param base_map ggmap raster to draw on. Defaults to the global
#'   `newyork_map`, so existing calls keep working unchanged.
#' @return A ggplot object.
map_plot <- function(df, color_col, continues_color_col = TRUE,
                     base_map = newyork_map) {
  stopifnot(
    is.character(color_col),
    length(color_col) == 1L,
    color_col %in% names(df)
  )

  color_scale <- if (continues_color_col) {
    scale_color_viridis_c()
  } else {
    scale_color_viridis_d()
  }

  # aes_string() is deprecated in ggplot2; the .data pronoun looks columns up
  # by name instead. labs() keeps the legend titled with the column name.
  ggmap(base_map) +
    geom_point(
      data = df,
      aes(
        x = .data[["longitude"]],
        y = .data[["latitude"]],
        color = .data[[color_col]]
      ),
      size = 1
    ) +
    labs(color = color_col) +
    theme(legend.position = "bottom") +
    color_scale
}

New York price map

# Map prices up to the 95th percentile; rows above it are left out of the plot.
per95 <- quantile(airbnb_nyc$price, 0.95)
map_plot(airbnb_nyc[price <= per95], "price")

Categorise price variable

# Bin price into decile ranges and map the resulting factor.
# na.rm = TRUE keeps missing prices from aborting quantile(); dropping
# duplicated breaks guards against tied deciles, which would make cut() fail
# with "'breaks' are not unique".
breaks <- quantile(airbnb_nyc$price, seq(0, 1, by = .1), na.rm = TRUE)
breaks <- breaks[!duplicated(breaks)]

airbnb_nyc[, price_factor := cut(price, breaks = breaks,
                                 include.lowest = TRUE)]

map_plot(df = airbnb_nyc,
         color_col = "price_factor",
         continues_color_col = FALSE)

Reviews per month

  • We can use this as a proxy for how many guests a host receives.
  • For instance, we could use it to check whether some neighbourhoods are more popular than others.
# 95th percentile of reviews_per_month over rows where both last_review and
# reviews_per_month are present; rows below that value are mapped.
per95_rev <- airbnb_nyc[
  !(is.na(last_review) | is.na(reviews_per_month)),
  quantile(reviews_per_month, 0.95)
]

map_plot(airbnb_nyc[reviews_per_month < per95_rev], "reviews_per_month")

Dates

# Parse last_review as a year-month-day date in place, then print its summary.
set(airbnb_nyc, j = "last_review", value = ymd(airbnb_nyc[["last_review"]]))
summary(airbnb_nyc[["last_review"]])
##         Min.      1st Qu.       Median         Mean      3rd Qu.         Max. 
## "2011-03-28" "2018-07-08" "2019-05-19" "2018-10-04" "2019-06-23" "2019-07-08" 
##         NA's 
##      "10052"
  • Work in progress !